Code
import pandas as pd
import altair as alt
import seaborn as sns
import plotly.express as px
from vega_datasets import data
import matplotlib.pyplot as plt

Amelia Baier, Andrea Dukic, Mia Mayerhofer
| | Decision | State | Count |
|---|---|---|---|
| 0 | Admit | California | 9 |
| 1 | Admit | Colorado | 8 |
| 2 | Admit | Florida | 11 |
| 3 | Admit | Utah | 1 |
| 4 | Decline | California | 1 |
| 5 | Decline | Colorado | 6 |
| 6 | Decline | Florida | 13 |
| 7 | Decline | Mississippi | 1 |
| 8 | Decline | Oregon | 1 |
| 9 | Decline | Utah | 2 |
| 10 | Decline | Virginia | 4 |
| 11 | Waitlist | Alabama | 1 |
| 12 | Waitlist | California | 2 |
| 13 | Waitlist | Colorado | 4 |
| 14 | Waitlist | Florida | 11 |
| 15 | Waitlist | New York | 1 |
| 16 | Waitlist | Utah | 3 |
| 17 | Waitlist | Vermont | 1 |
Above are the number of students per state and decision. Note that for most states and decisions there are only a handful of students in each row. This means that the analysis conducted later cannot be representative of the entire population.
To provide an overview of the data, we will be looking at the data from a geographic perspective, specifically at the state level.
# Average every numeric application feature by home state, then replace the
# full state name with its two-letter USPS abbreviation — the format that
# plotly's "USA-states" location mode expects.
feature_cols = ['GPA', 'WorkExp', 'TestScore', 'WritingScore', 'VolunteerLevel']
avg_df = df.groupby('State', as_index=False)[feature_cols].mean()
state_abbr = {
    'Alabama': 'AL',
    'California': 'CA',
    'Colorado': 'CO',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Mississippi': 'MS',
    'New York': 'NY',
    'Oregon': 'OR',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA'
}
avg_df['State'] = avg_df['State'].map(state_abbr)
# Keep 'State' as the LAST column: the dropdown below iterates
# avg_df.columns[:-1] to pick up only the numeric feature columns.
avg_df = avg_df[feature_cols + ['State']]
import plotly.io as pio
# Render plotly output inline in the notebook.
pio.renderers.default = "plotly_mimetype+notebook"

# Choropleth of average GPA by state; the dropdown added below swaps in the
# other averaged features by replacing the trace's z-values.
fig = px.choropleth(
    avg_df,
    locationmode="USA-states",
    locations=avg_df['State'],
    scope="usa",
    color=avg_df['GPA'],
    hover_data={"State": True, "GPA": True},
    labels={"GPA": "Selected Variable"},
    color_continuous_scale=custom_palette,
)

# One dropdown button per numeric feature — every column except the trailing
# 'State' column — each swapping the choropleth's z data in place.
buttons = [
    {'label': col, 'method': 'update', 'args': [{'z': [avg_df[col]]}]}
    for col in avg_df.columns[:-1]
]
fig.update_layout(
    updatemenus=[{'buttons': buttons, 'direction': 'down', 'showactive': True}],
    title='Choropleth Map of Average Selected Variable',
)
fig.update_coloraxes(colorbar_title=dict(text='Selected Variable'))
fig.show()

Above is a choropleth map of the average numeric features (GPA, test score, writing score, work experience in years, and volunteer level) by state. The average of each numeric feature is calculated across all decision types to obtain a holistic view of the student data by state. Below we summarize some findings for each feature:
| GPA | Test Score | Writing Score |
|---|---|---|
| California has the highest average GPA, with Florida and New York close behind. | California has the highest average test score. | California has the highest average writing score. |
| Oregon and Mississippi have the lowest average GPA. | Mississippi has the lowest average test score. | New York has the lowest average writing score. |
| Work Experience | Volunteer Level | |
| Mississippi has the highest average work experience in years. | Oregon has the highest average volunteer level. | |
| Oregon has the lowest average work experience. | Alabama has the lowest average volunteer level. |
We can also look at some of these features at the geographic level by decision.
def _avg_features_by_state(decision):
    """Mean of each numeric feature per state for one decision outcome.

    Returns a frame with columns State, GPA, WorkExp, TestScore,
    WritingScore, VolunteerLevel, State_Abbr (two-letter code) — in
    that order, matching what the maps below expect.
    """
    subset = df[df['Decision'] == decision]
    cols = subset[['State', 'GPA', 'WorkExp', 'TestScore', 'WritingScore', 'VolunteerLevel']]
    avg = cols.groupby('State').mean().reset_index()
    avg['State_Abbr'] = avg['State'].map(state_abbr)
    return avg


avg_admit = _avg_features_by_state('Admit')
avg_decline = _avg_features_by_state('Decline')
import altair as alt
from vega_datasets import data
# Fetch the lookup dataset ONCE (it was previously requested twice inside a
# single zip(...) expression) just for its state-name -> numeric id mapping;
# those ids are what the us-10m topojson uses to key state geometries.
_states_lookup = data.population_engineers_hurricanes()
state_id_dict = dict(zip(_states_lookup["state"], _states_lookup["id"]))
avg_admit["StateID"] = avg_admit["State"].map(state_id_dict)
avg_decline["StateID"] = avg_decline["State"].map(state_id_dict)
states = alt.topo_feature('https://raw.githubusercontent.com/vega/vega-datasets/master/data/us-10m.json', 'states')
click = alt.selection_multi(fields = ["State"])


def _state_choropleth(source, field, title):
    """Build one layered US choropleth of a single averaged feature.

    source : DataFrame carrying 'State', the `field` column, and 'StateID'
        (the topojson id the geometry is joined on).
    field  : name of the numeric column to color by (e.g. "GPA").
    title  : chart title.

    Layer 1 colors the states present in `source`; layer 2 greys out the
    states the lookup left without data (invalid `field`).  The tooltip and
    lookup are re-attached on the combined layer so hovering works on the
    layered view as well, mirroring the original per-chart encoding.
    """
    existing = alt.Chart(states).mark_geoshape(stroke='black').encode(
        color=alt.Color(f"{field}:Q", scale=alt.Scale(range=custom_palette)),
        tooltip=["State:N", f"{field}:Q"],
        opacity=alt.condition(f'isValid(datum.{field})', alt.value(1), alt.value(0.2)),
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(source, "StateID", list(source.columns))
    ).properties(width=333, height=200, title=title).add_selection(click).project(type="albersUsa").interactive()

    missing = (
        alt.Chart(states)
        .mark_geoshape(fill="grey", stroke="white")
        .encode(opacity=alt.condition(f"isValid(datum.{field})", alt.value(0), alt.value(0.2)))
        .add_selection(click)
        .project(type="albersUsa")
    )

    return (existing + missing).encode(
        tooltip=["State:N", f"{field}:Q"]
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(source, "StateID", list(source.columns))
    ).interactive()


# The four charts below were previously four near-identical ~20-line
# constructions; only the source frame, colored field, and title differ.
admit_gpa = _state_choropleth(avg_admit, "GPA", "Average Admitted GPA by State")
decline_gpa = _state_choropleth(avg_decline, "GPA", "Average Declined GPA by State")
admit_gpa | decline_gpa
admit_test = _state_choropleth(avg_admit, "TestScore", "Average Admitted Test Score by State")
decline_test = _state_choropleth(avg_decline, "TestScore", "Average Declined Test Score by State")
admit_test | decline_test

As we can see from the average GPA and test scores for admitted and declined students by state, students who were admitted tended to have higher GPAs and test scores than those who were declined.
This insight suggests that helping students improve their test scores could increase their chances of being admitted to an internship.
We can also examine the rates of students admitted to and declined from internships by state to see, overall, how successful the students from each state are.
# Per-state share of each decision outcome.  NOTE: 'Rate' is the share of a
# given decision's students that came from each state (the rows for one
# decision sum to 100); it is NOT a per-state admission probability.
decision_state = df.groupby(['Decision', 'State'])[["GPA"]].count().reset_index()
decision_state = decision_state.rename(columns={'GPA': 'StateCount'})
decision_state['DecisionCount'] = decision_state.groupby('Decision')['StateCount'].transform('sum')
decision_state['Rate'] = decision_state['StateCount'] / decision_state['DecisionCount'] * 100

# Map state names to the topojson ids; fetch the lookup dataset once (it was
# previously requested twice inside a single zip(...) expression).
_states_lookup = data.population_engineers_hurricanes()
state_id_dict = dict(zip(_states_lookup["state"], _states_lookup["id"]))
decision_state["StateID"] = decision_state["State"].map(state_id_dict)

admit_states = decision_state[decision_state['Decision'] == "Admit"]
decline_states = decision_state[decision_state['Decision'] == "Decline"]
states = alt.topo_feature('https://raw.githubusercontent.com/vega/vega-datasets/master/data/us-10m.json', 'states')
click = alt.selection_multi(fields = ["State"])


def _rate_choropleth(source, title):
    """Build one layered US choropleth of the 'Rate' column of `source`.

    Layer 1 colors the states present in `source`; layer 2 greys out the
    states the lookup left without data (invalid Rate).  The tooltip and
    lookup are re-attached on the combined layer so hovering works on the
    layered view as well, mirroring the original per-chart encoding.
    """
    existing = alt.Chart(states).mark_geoshape(stroke='black').encode(
        color=alt.Color("Rate:Q", scale=alt.Scale(range=custom_palette)),
        tooltip=["State:N", "Rate:Q"],
        opacity=alt.condition('isValid(datum.Rate)', alt.value(1), alt.value(0.2)),
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(source, "StateID", list(source.columns))
    ).properties(width=333, height=200, title=title).add_selection(click).project(type="albersUsa").interactive()

    missing = (
        alt.Chart(states)
        .mark_geoshape(fill="grey", stroke="white")
        .encode(opacity=alt.condition("isValid(datum.Rate)", alt.value(0), alt.value(0.2)))
        .add_selection(click)
        .project(type="albersUsa")
    )

    return (existing + missing).encode(
        tooltip=["State:N", "Rate:Q"]
    ).transform_lookup(
        lookup="id",
        from_=alt.LookupData(source, "StateID", list(source.columns))
    ).interactive()


# Previously two near-identical ~20-line constructions; only the source
# frame and title differ.
admit_map = _rate_choropleth(admit_states, "Admission Rates by State")
decline_map = _rate_choropleth(decline_states, "Rejection Rates by State")
admit_map | decline_map

Above are the maps of the rates of students admitted by state and the rates of students declined by state. Some findings from the maps are:
There isn’t a clear relationship between admission and rejection rates by state, which suggests that the state a student is from is not a decisive factor in the outcome of their internship application.
Before applying machine learning, below we will explain the reasoning behind applying ML to the student admissions data. We wanted to identify relationships between the academic features in the data on the decision, namely GPA, writing score, and test score. The motivation behind this is to use the information of the relationships, if any are present, to help students in our university understand what features of their application might contribute to the decision. We then hope to provide targeted help to our students based on our findings to increase the number of students admitted to internships.
Above is the pairplot of GPA, writing score, and test score of the students grouped by the decision. When looking at the scatterplots, we notice some patterns:
Through the pairplot we can see that some of the academic features have relationships by decision result, but some features seem to be more important than others. We will investigate the importance of all features on decision using XGBoost and Shapley values.